{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "特征工程系列：聚合特征构造以及转换特征构造  https://www.cnblogs.com/purple5252/p/11812067.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>col_0</th>\n",
       "      <th>col_1</th>\n",
       "      <th>col_2</th>\n",
       "      <th>col_3</th>\n",
       "      <th>col_4</th>\n",
       "      <th>col_5</th>\n",
       "      <th>col_6</th>\n",
       "      <th>col_7</th>\n",
       "      <th>col_8</th>\n",
       "      <th>col_9</th>\n",
       "      <th>col_10</th>\n",
       "      <th>col_11</th>\n",
       "      <th>col_12</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.00632</td>\n",
       "      <td>18.0</td>\n",
       "      <td>2.31</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.538</td>\n",
       "      <td>6.575</td>\n",
       "      <td>65.2</td>\n",
       "      <td>4.0900</td>\n",
       "      <td>1.0</td>\n",
       "      <td>296.0</td>\n",
       "      <td>15.3</td>\n",
       "      <td>396.90</td>\n",
       "      <td>4.98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.02731</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.07</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.469</td>\n",
       "      <td>6.421</td>\n",
       "      <td>78.9</td>\n",
       "      <td>4.9671</td>\n",
       "      <td>2.0</td>\n",
       "      <td>242.0</td>\n",
       "      <td>17.8</td>\n",
       "      <td>396.90</td>\n",
       "      <td>9.14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.02729</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.07</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.469</td>\n",
       "      <td>7.185</td>\n",
       "      <td>61.1</td>\n",
       "      <td>4.9671</td>\n",
       "      <td>2.0</td>\n",
       "      <td>242.0</td>\n",
       "      <td>17.8</td>\n",
       "      <td>392.83</td>\n",
       "      <td>4.03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.03237</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.18</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.458</td>\n",
       "      <td>6.998</td>\n",
       "      <td>45.8</td>\n",
       "      <td>6.0622</td>\n",
       "      <td>3.0</td>\n",
       "      <td>222.0</td>\n",
       "      <td>18.7</td>\n",
       "      <td>394.63</td>\n",
       "      <td>2.94</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.06905</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.18</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.458</td>\n",
       "      <td>7.147</td>\n",
       "      <td>54.2</td>\n",
       "      <td>6.0622</td>\n",
       "      <td>3.0</td>\n",
       "      <td>222.0</td>\n",
       "      <td>18.7</td>\n",
       "      <td>396.90</td>\n",
       "      <td>5.33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>501</th>\n",
       "      <td>0.06263</td>\n",
       "      <td>0.0</td>\n",
       "      <td>11.93</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.573</td>\n",
       "      <td>6.593</td>\n",
       "      <td>69.1</td>\n",
       "      <td>2.4786</td>\n",
       "      <td>1.0</td>\n",
       "      <td>273.0</td>\n",
       "      <td>21.0</td>\n",
       "      <td>391.99</td>\n",
       "      <td>9.67</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>502</th>\n",
       "      <td>0.04527</td>\n",
       "      <td>0.0</td>\n",
       "      <td>11.93</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.573</td>\n",
       "      <td>6.120</td>\n",
       "      <td>76.7</td>\n",
       "      <td>2.2875</td>\n",
       "      <td>1.0</td>\n",
       "      <td>273.0</td>\n",
       "      <td>21.0</td>\n",
       "      <td>396.90</td>\n",
       "      <td>9.08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>503</th>\n",
       "      <td>0.06076</td>\n",
       "      <td>0.0</td>\n",
       "      <td>11.93</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.573</td>\n",
       "      <td>6.976</td>\n",
       "      <td>91.0</td>\n",
       "      <td>2.1675</td>\n",
       "      <td>1.0</td>\n",
       "      <td>273.0</td>\n",
       "      <td>21.0</td>\n",
       "      <td>396.90</td>\n",
       "      <td>5.64</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>504</th>\n",
       "      <td>0.10959</td>\n",
       "      <td>0.0</td>\n",
       "      <td>11.93</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.573</td>\n",
       "      <td>6.794</td>\n",
       "      <td>89.3</td>\n",
       "      <td>2.3889</td>\n",
       "      <td>1.0</td>\n",
       "      <td>273.0</td>\n",
       "      <td>21.0</td>\n",
       "      <td>393.45</td>\n",
       "      <td>6.48</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>505</th>\n",
       "      <td>0.04741</td>\n",
       "      <td>0.0</td>\n",
       "      <td>11.93</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.573</td>\n",
       "      <td>6.030</td>\n",
       "      <td>80.8</td>\n",
       "      <td>2.5050</td>\n",
       "      <td>1.0</td>\n",
       "      <td>273.0</td>\n",
       "      <td>21.0</td>\n",
       "      <td>396.90</td>\n",
       "      <td>7.88</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>506 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       col_0  col_1  col_2  col_3  col_4  col_5  col_6   col_7  col_8  col_9  \\\n",
       "0    0.00632   18.0   2.31    0.0  0.538  6.575   65.2  4.0900    1.0  296.0   \n",
       "1    0.02731    0.0   7.07    0.0  0.469  6.421   78.9  4.9671    2.0  242.0   \n",
       "2    0.02729    0.0   7.07    0.0  0.469  7.185   61.1  4.9671    2.0  242.0   \n",
       "3    0.03237    0.0   2.18    0.0  0.458  6.998   45.8  6.0622    3.0  222.0   \n",
       "4    0.06905    0.0   2.18    0.0  0.458  7.147   54.2  6.0622    3.0  222.0   \n",
       "..       ...    ...    ...    ...    ...    ...    ...     ...    ...    ...   \n",
       "501  0.06263    0.0  11.93    0.0  0.573  6.593   69.1  2.4786    1.0  273.0   \n",
       "502  0.04527    0.0  11.93    0.0  0.573  6.120   76.7  2.2875    1.0  273.0   \n",
       "503  0.06076    0.0  11.93    0.0  0.573  6.976   91.0  2.1675    1.0  273.0   \n",
       "504  0.10959    0.0  11.93    0.0  0.573  6.794   89.3  2.3889    1.0  273.0   \n",
       "505  0.04741    0.0  11.93    0.0  0.573  6.030   80.8  2.5050    1.0  273.0   \n",
       "\n",
       "     col_10  col_11  col_12  \n",
       "0      15.3  396.90    4.98  \n",
       "1      17.8  396.90    9.14  \n",
       "2      17.8  392.83    4.03  \n",
       "3      18.7  394.63    2.94  \n",
       "4      18.7  396.90    5.33  \n",
       "..      ...     ...     ...  \n",
       "501    21.0  391.99    9.67  \n",
       "502    21.0  396.90    9.08  \n",
       "503    21.0  396.90    5.64  \n",
       "504    21.0  393.45    6.48  \n",
       "505    21.0  396.90    7.88  \n",
       "\n",
       "[506 rows x 13 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#构造数据\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.datasets import load_boston\n",
    "\n",
    "X,y = load_boston(return_X_y = True)\n",
    "X = pd.DataFrame(X,columns=['col_'+ str(i) for i in range(X.shape[1]) ])\n",
    "y = pd.DataFrame(y,columns=['target'])\n",
    "\n",
    "#构建一个训练集和测试集，测试集是没有target的\n",
    "train = pd.concat([X,y],axis=1)\n",
    "test = X.iloc[:10,:]\n",
    "\n",
    "x_train = X.copy()\n",
    "y_train = y.copy()\n",
    "data = x_train.copy()\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征构造的方式"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "非监督  \n",
    "等频、\n",
    "等距、\n",
    "kmeans、\n",
    "\n",
    "监督：  \n",
    "best_KS、\n",
    "Chi、"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 自定义分箱"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    372\n",
       "4    101\n",
       "3     23\n",
       "2     10\n",
       "1      0\n",
       "Name: col_1_bucket, dtype: int64"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 对可分类的连续特征进行分桶，kilometer是已经分桶了\n",
    "col_1_bucket=pd.cut(\n",
    "        data.col_1, \n",
    "        bins = [float('-inf'), 5,10,15,20,float('inf')],\n",
    "        labels=[0, 1, 2, 3, 4, ])\n",
    "data = data.assign(col_1_bucket = col_1_bucket)\n",
    "\n",
    "data['col_1_bucket'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 等距分箱"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "strategy='uniform'  \n",
    "uniform:等距离  \n",
    "quantile：等频率  \n",
    "kmeans：Kmeans聚类分箱      "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    384\n",
       "1.0     64\n",
       "4.0     29\n",
       "2.0     19\n",
       "3.0     10\n",
       "Name: col_1_bucket, dtype: int64"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import KBinsDiscretizer\n",
    "est = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')\n",
    "est.fit(data['col_1'].values.reshape(-1,1))\n",
    "col_1_bucket = est.transform(data['col_1'].values.reshape(-1,1))\n",
    "data = data.assign(col_1_bucket = col_1_bucket)\n",
    "\n",
    "data['col_1_bucket'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
